In [3]:
import nltk
import pandas as pd

NLTK Documentation

 Understanding how NLTK works
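
 The tokenizer and tagger used below need their NLTK data packages installed once. A minimal setup sketch (resource names assumed from the standard NLTK distribution):

In [ ]:
# Models used by nltk.word_tokenize / nltk.sent_tokenize
nltk.download('punkt')
# Model used by nltk.pos_tag
nltk.download('averaged_perceptron_tagger')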


In [4]:
sentence = """At eight o'clock on Thursday morning
... Arthur didn't feel very good."""

In [5]:
tokens = nltk.word_tokenize(sentence)

In [6]:
tokens


Out[6]:
['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

In [7]:
tagged = nltk.pos_tag(tokens)

In [8]:
tagged


Out[8]:
[('At', 'IN'),
 ('eight', 'CD'),
 ("o'clock", 'NN'),
 ('on', 'IN'),
 ('Thursday', 'NNP'),
 ('morning', 'NN'),
 ('Arthur', 'NNP'),
 ('did', 'VBD'),
 ("n't", 'RB'),
 ('feel', 'VB'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('.', '.')]
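
 The tags are Penn Treebank codes (IN = preposition, CD = cardinal number, NNP = singular proper noun, and so on). NLTK can describe an unfamiliar tag; a small lookup sketch (needs the 'tagsets' data package):

In [ ]:
nltk.download('tagsets')
# Print the definition and examples for the proper-noun tag
nltk.help.upenn_tagset('NNP')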

So let's try to use this on our Daphne data.


In [9]:
df = pd.read_csv('allPostText_test.csv')

In [10]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 11 columns):
Unnamed: 0      100 non-null int64
Unnamed: 0.1    100 non-null int64
Date_1          100 non-null int64
Date_2          100 non-null int64
Date_3          100 non-null object
ID_page         100 non-null int64
ID_post         100 non-null int64
Link            100 non-null object
Title           100 non-null object
Txt             89 non-null object
Text            100 non-null object
dtypes: int64(6), object(5)
memory usage: 8.7+ KB

In [11]:
def vec(name):
    tokens = nltk.word_tokenize(name)
    tagged = nltk.pos_tag(tokens)
    return tagged

In [12]:
df['Text'].apply(vec).head(10)


Out[12]:
0    [(«, VB), (back, RB), (to, TO), (home, NN), (Y...
1    [(«, VB), (back, RB), (to, TO), (home, NN), (E...
2    [(«, VB), (back, RB), (to, TO), (home, NN), (I...
3    [(«, VB), (back, RB), (to, TO), (home, NN), (T...
4    [(«, VB), (back, RB), (to, TO), (home, NN), (“...
5    [(«, VB), (back, RB), (to, TO), (home, NN), («...
6    [(«, VB), (back, RB), (to, TO), (home, NN), (I...
7    [(«, VB), (back, RB), (to, TO), (home, NN), (T...
8    [(«, VB), (back, RB), (to, TO), (home, NN), («...
9    [(«, VB), (back, RB), (to, TO), (home, NN), (D...
Name: Text, dtype: object

In [13]:
df['Tags'] = df['Text'].apply(vec)
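
 Tagging is the slow step, and the previous two cells each ran it over the full column. On a larger dataset it would be cheaper to tag once and preview the stored column, roughly:

In [ ]:
df['Tags'] = df['Text'].apply(vec)
df['Tags'].head(10)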

 Only keep the tokens whose tag (the second item of each pair) is NNP


In [14]:
def token(tags):
    mini_list = []
    # Keep the word of every (word, tag) pair tagged NNP (proper noun, singular)
    for elem in tags:
        if elem[1] == 'NNP':
            mini_list.append(elem[0])
    return mini_list
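
 As a quick sanity check, applying it to the tagged example sentence from earlier should keep only the proper nouns:

In [ ]:
# In the Out[8] list above, only 'Thursday' and 'Arthur' carry the NNP tag
token(tagged)
# expected: ['Thursday', 'Arthur']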

In [15]:
df['People list'] = df['Tags'].apply(token)

In [16]:
df['People list'].head(10)


Out[16]:
0                                                   []
1    [Kurz, Austria’s, People’s, Party, OVP, Freedo...
2    [Naxxar, Labour, Party, Prime, Minister’s, Sun...
3    [Nationalist, Party, Nationalist, Party, Malta...
4                                              [“I, «]
5                                                  [«]
6                                                   []
7    [Toni, Bezzina, Nationalist, Party’s, MP, Robe...
8                                                  [«]
9    [David, Agius, Nationalist, Party’s, Edwin, Va...
Name: People list, dtype: object

 Not great, so let's look around for a better solution.


In [17]:
# This looks promising

# Run "sudo python -m nltk.downloader all" in a shell if you have problems
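
 The full downloader fetches every corpus; if only the named-entity chunker is missing, a lighter in-notebook sketch (resource names assumed from the standard NLTK distribution) is:

In [ ]:
# Data used by nltk.ne_chunk
nltk.download('maxent_ne_chunker')
nltk.download('words')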


In [54]:
for sent in nltk.sent_tokenize(df['Text'][1]):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
        if hasattr(chunk, 'label'):
            print(chunk.label()+',', ' '.join(c[0] for c in chunk))


PERSON, Sebastian Kurz
ORGANIZATION, OVP
ORGANIZATION, Freedom Party
ORGANIZATION, FPÖ
ORGANIZATION, Social Democrats
ORGANIZATION, People’s Party
ORGANIZATION, People’s Party
GPE, Sebastian
PERSON, Kurz
PERSON, Christian Kern
ORGANIZATION, Social Democrats
PERSON, Kurz
GSP, Austria

In [61]:
def peopled(elem):
    mini_list = []
    # Split the post into sentences, then tokenize, POS-tag and NE-chunk each one
    for sent in nltk.sent_tokenize(elem):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            # Named-entity chunks are subtrees that carry a label (PERSON, ORGANIZATION, ...)
            if hasattr(chunk, 'label'):
                p = chunk.label(), ' '.join(c[0] for c in chunk)
                mini_list.append(p)
    return mini_list

In [63]:
df['people'] = df['Text'].apply(peopled)

 Only getting the people (the PERSON entities)


In [65]:
lst = list(df['people'])

In [71]:
lst = [x for x in lst if x !=[]]

In [72]:
flat_list = [item for sublist in lst for item in sublist]

In [74]:
name_list = []
for name in flat_list:
    if name[0] == 'PERSON':
        name_list.append(name[1])
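
 The three steps above (drop the empty lists, flatten, keep PERSON entries) could also be written as a single comprehension over the 'people' column; a sketch assuming the same (label, name) tuples:

In [ ]:
name_list = [name for ents in df['people'] for label, name in ents if label == 'PERSON']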

In [79]:
pd.DataFrame(name_list)[0].value_counts()


Out[79]:
Delia                   41
Adrian Delia            31
Malta                   14
Jean Pierre Debono      14
Debono                  12
Muscat                  10
Mrs Delia                8
Agius                    7
Rebecca Dimech           7
Joseph Muscat            7
David Agius              6
Anton Rea Cutajar        5
David                    5
Frank Portelli           5
Adrian Delia’s           5
Clyde Puli               4
Clyde                    4
Kristy Debono            4
Bundy                    4
Cutajar                  4
Robert Arrigo            4
Eddie Fenech Adami       3
Keith Schembri           3
Keith                    3
Kristy                   3
Borg Olivier             3
Edwin Vassallo           3
Andre Falzon             3
Fenech Adami             2
Kurt Farrugia            2
                        ..
Hubert Zammit            1
Bad                      1
Pasta Rummo              1
Toni Bezzina             1
Kevin Cassar             1
Censu L-Iswed            1
Rudyard                  1
Xaraban                  1
Mad                      1
Puli                     1
Alexander                1
Borg                     1
Malta here.The Times     1
Hang                     1
Mandalay                 1
Farrugia                 1
Censu                    1
Leonardo Fasoli          1
Manwel Dimech Street     1
Gozo                     1
Opposition               1
Chris Said               1
Rebecca                  1
Pierre                   1
Joseph Muscat.”          1
Joe                      1
Botox                    1
Beppe Fenech Adami       1
Gonzi                    1
Maze Pictures            1
Name: 0, Length: 171, dtype: int64